# Load daily trip counts (tab-separated; has a `ymd` date column, `num_trips`,
# and weather columns used as predictors below).
trips_per_day <- read_tsv("trips_per_day.tsv")

# Load the holiday lookup table (file has no header row).
holiday <- read.csv(
  "holiday.csv",
  header = FALSE,
  col.names = c("day_num", "ymd", "holiday_name")
)

# Convert ymd to a Date type in holiday so it can be joined on trips_per_day$ymd.
holiday$ymd <- as.Date(holiday$ymd)

# Join holiday info to trips_per_day and add an is_holiday indicator:
# a day is a holiday iff it matched a row in the holiday table.
trips_per_day <- trips_per_day %>%
  left_join(holiday, by = "ymd") %>%
  mutate(is_holiday = !is.na(holiday_name)) %>%
  select(-holiday_name, -day_num)

# Add the abbreviated day of week as another predictor.
# NOTE: weekdays() accepts Date directly; the previous as.POSIXct(ymd) detour
# pins the Date to UTC midnight and can shift the labeled day in non-UTC
# timezones, so it is avoided here.
trips_per_day <- trips_per_day %>%
  mutate(weekday = weekdays(ymd, abbreviate = TRUE))

# Flag flu season (December through February) as another predictor.
flu_season <- c("12", "01", "02")
trips_per_day <- trips_per_day %>%
  mutate(
    month = format(ymd, "%m"),
    is_flu_season = month %in% flu_season
  ) %>%
  select(-month)
# Reproducible 80/10/10 train/validation/test split using the caTools package.
set.seed(42)
# `in_train` (not `sample`) avoids shadowing base::sample().
in_train <- sample.split(trips_per_day$num_trips, SplitRatio = 0.8)
train_data <- subset(trips_per_day, in_train == TRUE)
other_data <- subset(trips_per_day, in_train == FALSE)
# Split the remaining 20% evenly into validation and test sets.
in_validate <- sample.split(other_data$num_trips, SplitRatio = 0.5)
validate_data <- subset(other_data, in_validate == TRUE)
test_data <- subset(other_data, in_validate == FALSE)
# Sanity check: every row landed in exactly one of the three splits.
nrow(trips_per_day) == sum(nrow(train_data), nrow(test_data), nrow(validate_data))
## [1] TRUE
# Cross-validate the polynomial degree for a tmin-only model.
K <- 1:8
# Preallocate the error vectors instead of growing them inside the loop.
train_err <- numeric(length(K))
validate_err <- numeric(length(K))
for (k in K) {
  # Fit on the training data.
  model <- lm(num_trips ~ poly(tmin, k, raw = TRUE), train_data)
  # RMSE: sqrt(mean((prediction - truth)^2)), on train and validation sets.
  train_err[k] <- sqrt(mean((predict(model, train_data) - train_data$num_trips)^2))
  validate_err[k] <- sqrt(mean((predict(model, validate_data) - validate_data$num_trips)^2))
}
# Reshape to long format for plotting (pivot_longer supersedes gather).
plot_data <- data.frame(K, train_err, validate_err) %>%
  pivot_longer(-K, names_to = "split", values_to = "error")
ggplot(plot_data, aes(x = K, y = error, color = split)) +
  geom_line() +
  scale_x_continuous(breaks = K) +
  xlab('Polynomial Degrees') +
  ylab('RMSE')
# Fit a model for each polynomial degree, now with 7 predictors:
# four numeric weather variables plus weekday/holiday/flu-season indicators.
K <- 1:8
# Preallocate the error vectors instead of growing them inside the loop.
train_err <- numeric(length(K))
validate_err <- numeric(length(K))
for (k in K) {
  # Fit on the training data; each numeric predictor gets a degree-k polynomial.
  model <- lm(
    num_trips ~ poly(prcp, k, raw = TRUE) + poly(snwd, k, raw = TRUE) +
      poly(tmax, k, raw = TRUE) + poly(tmin, k, raw = TRUE) +
      weekday + is_holiday + is_flu_season,
    train_data
  )
  # RMSE: sqrt(mean((prediction - truth)^2)), on train and validation sets.
  train_err[k] <- sqrt(mean((predict(model, train_data) - train_data$num_trips)^2))
  validate_err[k] <- sqrt(mean((predict(model, validate_data) - validate_data$num_trips)^2))
}
# Reshape to long format for plotting (pivot_longer supersedes gather).
plot_data <- data.frame(K, train_err, validate_err) %>%
  pivot_longer(-K, names_to = "split", values_to = "error")
ggplot(plot_data, aes(x = K, y = error, color = split)) +
  geom_line() +
  scale_x_continuous(breaks = K) +
  xlab('Polynomial Degrees') +
  ylab('RMSE')
# Polynomial of degree two gives the lowest validation error. Further investigation still needed.
# Fit the full model with every numeric predictor at degree 2.
model <- lm(
  num_trips ~ poly(prcp, 2, raw = TRUE) + poly(snwd, 2, raw = TRUE) +
    poly(tmax, 2, raw = TRUE) + poly(tmin, 2, raw = TRUE) +
    weekday + is_holiday + is_flu_season,
  train_data
)
glance(model)

# Take out insignificant predictors: drop the quadratic term for snwd.
model <- lm(
  num_trips ~ poly(prcp, 2, raw = TRUE) + snwd +
    poly(tmax, 2, raw = TRUE) + poly(tmin, 2, raw = TRUE) +
    weekday + is_holiday + is_flu_season,
  train_data
)
# Check R^2 on train and validation data.
rsquare(model, train_data)
## [1] 0.8974348
rsquare(model, validate_data)
## [1] 0.8960873

# Take out more insignificant predictors: drop the quadratic terms for
# tmax and tmin as well.
model <- lm(
  num_trips ~ poly(prcp, 2, raw = TRUE) + snwd + tmax + tmin +
    weekday + is_holiday + is_flu_season,
  train_data
)
# Check RMSE and R^2 on train and validation data.
rmse(model, train_data)
## [1] 3199.978
rmse(model, validate_data)
## [1] 3373.947
rsquare(model, train_data)
## [1] 0.8964213
rsquare(model, validate_data)
## [1] 0.8944062

# Best model (a refit of the reduced model above).
model <- lm(
  num_trips ~ poly(prcp, 2, raw = TRUE) + snwd + tmax + tmin +
    weekday + is_holiday + is_flu_season,
  train_data
) # 0.8976948
rsquare(model, train_data)
## [1] 0.8964213
tidy(model)

# Test the model on train and validation data combined
# (bind_rows is the dplyr-native equivalent of rbind for tibbles).
com_data <- bind_rows(train_data, validate_data)
rsquare(model, com_data)
## [1] 0.8967198
# Attach the model's predictions to the validation set for plotting.
plot_data <- add_predictions(validate_data, model)

# Interactive time-series view: actual counts in black, predictions in red.
p_time <- ggplot(plot_data, aes(x = ymd, y = pred)) +
  geom_point(aes(y = num_trips)) +
  geom_line(aes(y = pred), color = "red") +
  geom_point(aes(y = pred), color = "red") +
  geom_smooth() +
  xlab("Date") +
  ylab("Predicted (in red)/ Actual (in black)") +
  ggtitle("Number of trips at different dates")
ggplotly(p_time)

# Predicted vs. actual; the dashed identity line marks perfect prediction.
ggplot(plot_data, aes(x = pred, y = num_trips)) +
  geom_point() +
  geom_abline(linetype = "dashed") +
  xlab("Predicted") +
  ylab("Actual") +
  ggtitle("Predicted Value against Actual Value ")
# Evaluate the chosen model on the held-out test set.
rmse(model, test_data)
## [1] 13089.71
# negative R^2 !!
rsquare(model, test_data)
## [1] -0.2801334

# Attach model predictions to the test data for plotting.
plot_test_data <- add_predictions(test_data, model)

# Date vs. trip counts: predictions in red, actual values in black.
p_test <- ggplot(plot_test_data, aes(x = ymd, y = pred)) +
  geom_point(aes(y = num_trips)) +
  geom_line(aes(y = pred), color = "red") +
  geom_point(aes(y = pred), color = "red") +
  geom_smooth() +
  xlab("Date") +
  ylab("Predicted (in red)/ Actual (in black)") +
  ggtitle("Number of trips at different dates")
ggplotly(p_test)
# Remove the outlier observed on 2014-04-30 and re-evaluate on the
# remaining test rows.
plot_data <- test_data %>%
  add_predictions(model) %>%
  # Compare Date to Date explicitly rather than relying on implicit
  # character-to-Date coercion.
  filter(ymd != as.Date("2014-04-30"))

# Small error once the outlier is excluded: R^2 = 0.95.
rmse(model, plot_data)
## [1] 2510.775
rsquare(model, plot_data)
## [1] 0.9504549

# Date vs. trip counts without the outlier: predictions in red, actual in black.
ggplotly(
  ggplot(plot_data, aes(x = ymd, y = pred)) +
    geom_point(aes(y = num_trips)) +
    geom_line(aes(y = pred), color = "red") +
    geom_point(aes(y = pred), color = "red") +
    geom_smooth() +
    xlab("Date") +
    ylab("Predicted (in red)/ Actual (in black)") +
    ggtitle("Number of trips at different dates")
)